Load libraries

library(dplyr)
library(tidyverse)
library(mlr)
library(gridExtra)
library(ggfortify)
library(edgeR)
library(org.Hs.eg.db)
library(RColorBrewer)
library(gplots)
library(tidyr)
library(pheatmap)
library(reshape2)
library(survminer)
library(rms)
library(pheatmap)
library(ggplot2)

my_theme <- function(){
    theme(
      axis.text = element_text(size=8),
      axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 0.5),
      axis.title = element_text(size =8),
      panel.grid.major = element_line(color = "gray"),
      panel.grid.minor = element_blank(),
      panel.background = element_rect(fill = "#ffffff"),
      strip.background = element_rect(fill = "black", color = "black", size =0.5),
      strip.text = element_text(face = "bold", size = 8, color = "white"),
      legend.position = "bottom",
      legend.justification = "center",
      legend.background = element_blank(),
      panel.border = element_rect(color = "grey5", fill = NA, size = 0.5)
    )
}

theme_set(my_theme())

mycolors=c("darkred","black","#004431","#000c44","#2d1600","purple","#202302")
myfillcolors=c("#d10c0c","grey10","#1e6651","#213989","#5b431a","purple","#393f03")

Clinical data exploration

1. Prepare data and explore basic demographics

clin_data <- read.csv("sc3_Training_ClinAnnotations.csv")

## group age into bins for visualization purposes
labs <- c(paste(seq(20, 80, by=15), seq(20+15-1, 95-1, by=15), sep="-"))
clin_data$agegrp <- cut(clin_data$D_Age, breaks=c(seq(20,80, by=15), Inf), labels=labs, right=F)

## assumption: events where HR_FLAG it is not TRUE (event did not occur) are assumed to be censored
## 131 out of 583 events occurred and rest are censored
clin_data$HR_FLAG <- clin_data$HR_FLAG==TRUE

## check % of NAs present in each column of the dataset
nacount <- apply(clin_data, 2, function(x){
  sum(is.na(x))
})
nacount <- as.data.frame(nacount, row.names=names(clin_data))
nacount <- add_rownames(nacount) %>% dplyr::mutate(percentNA=round(nacount/583*100,2))%>%
  dplyr::filter(percentNA>=0.00)

## remove columns where NAs >50%
cols_to_remove <- nacount[nacount$percentNA>50,]$rowname
clin_data <- clin_data[, !(names(clin_data) %in% cols_to_remove)]

## remove unnecessary columns -> keep only D_PFS as time column,  remove file info columns, 
## study and patient status columns also removed (all same)
clin_data <- clin_data[, -c(1, 5:6, 8, 10:11,13:17)]

## remove NAs (only 20 in D_ISS column)
clin_data <- clin_data[complete.cases(clin_data),]

## age by gender distribution plot
clin_data %>% ggplot(aes(x=agegrp, color=D_Gender, fill=D_Gender))+geom_bar()+
  ggtitle("Age distribution by gender")+xlab("Age group [years]")+ylab("Number of subjects")+theme_minimal()

2. Explore effect of factors like age and ISS on PFS

par(mfrow=c(1,2))

## Kaplan-Meier curve for Death events in patients stratified by age and ISS
surv_plots <- list()
fit_iss=survival::survfit(survival::Surv(D_PFS,HR_FLAG)~D_ISS,data=clin_data)
ggsurvplot(fit_iss, pval = TRUE, conf.int = TRUE)

#autoplot(fit_iss)+scale_color_manual(values=mycolors)+scale_fill_manual(values=myfillcolors)

fit_age = survival::survfit(survival::Surv(D_PFS,HR_FLAG)~agegrp,data=clin_data)
ggsurvplot(fit_age, pval = TRUE, conf.int = TRUE)

#autoplot(fit_age)+scale_color_manual(values=mycolors)+scale_fill_manual(values=myfillcolors)

These figures indicate that the patients who were in their third stage and belonged to an older age group have higher PFS risk. Now, that it’s established that both age and ISS can be important predictors of MM death or relapse, a small 2-predictor model is built to apply a Cox regression model and predict the PFS time using the main effects of the two predictors - age and ISS.

## simple 2-predictor model with coxph using only age and ISS 
age_iss_model <- survival::coxph(survival::Surv(D_PFS,HR_FLAG) ~ 
                          D_Age + D_ISS , data=clin_data)


summary(age_iss_model)
## Call:
## survival::coxph(formula = survival::Surv(D_PFS, HR_FLAG) ~ D_Age + 
##     D_ISS, data = clin_data)
## 
##   n= 563, number of events= 122 
## 
##           coef exp(coef) se(coef)     z Pr(>|z|)    
## D_Age 0.026554  1.026909 0.008763 3.030  0.00244 ** 
## D_ISS 0.581850  1.789346 0.122814 4.738 2.16e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##       exp(coef) exp(-coef) lower .95 upper .95
## D_Age     1.027     0.9738     1.009     1.045
## D_ISS     1.789     0.5589     1.407     2.276
## 
## Concordance= 0.662  (se = 0.025 )
## Likelihood ratio test= 39.91  on 2 df,   p=2e-09
## Wald test            = 38.42  on 2 df,   p=5e-09
## Score (logrank) test = 40.15  on 2 df,   p=2e-09

Again, it appears that both age and ISS have a statistically significant effect, as predictors in the model.

3. Explore effects of cytogenetic features on PFS

## simple 13-predictor model using only cyto features with coxph - to detect which cyto features could be relevant as significant predictors
cyto_feat_model <- survival::coxph(survival::Surv(D_PFS,HR_FLAG) ~ 
                                     CYTO_predicted_feature_01+CYTO_predicted_feature_02+CYTO_predicted_feature_03+
                                     CYTO_predicted_feature_05+CYTO_predicted_feature_06+CYTO_predicted_feature_08+
                                     CYTO_predicted_feature_12+CYTO_predicted_feature_13+CYTO_predicted_feature_14+
                                     CYTO_predicted_feature_15+CYTO_predicted_feature_16+CYTO_predicted_feature_17+
                                     CYTO_predicted_feature_18, data=clin_data)
summary(cyto_feat_model)
## Call:
## survival::coxph(formula = survival::Surv(D_PFS, HR_FLAG) ~ CYTO_predicted_feature_01 + 
##     CYTO_predicted_feature_02 + CYTO_predicted_feature_03 + CYTO_predicted_feature_05 + 
##     CYTO_predicted_feature_06 + CYTO_predicted_feature_08 + CYTO_predicted_feature_12 + 
##     CYTO_predicted_feature_13 + CYTO_predicted_feature_14 + CYTO_predicted_feature_15 + 
##     CYTO_predicted_feature_16 + CYTO_predicted_feature_17 + CYTO_predicted_feature_18, 
##     data = clin_data)
## 
##   n= 563, number of events= 122 
## 
##                               coef exp(coef) se(coef)      z Pr(>|z|)   
## CYTO_predicted_feature_01  0.41477   1.51402  0.20968  1.978  0.04791 * 
## CYTO_predicted_feature_02  0.74766   2.11204  0.73900  1.012  0.31168   
## CYTO_predicted_feature_03 -1.01689   0.36172  0.33074 -3.075  0.00211 **
## CYTO_predicted_feature_05  0.18337   1.20126  0.21800  0.841  0.40024   
## CYTO_predicted_feature_06  0.24968   1.28362  0.26331  0.948  0.34301   
## CYTO_predicted_feature_08  0.27432   1.31563  0.29038  0.945  0.34481   
## CYTO_predicted_feature_12  0.58770   1.79985  1.64586  0.357  0.72103   
## CYTO_predicted_feature_13 -0.14107   0.86843  0.38509 -0.366  0.71413   
## CYTO_predicted_feature_14  1.04206   2.83506  0.48209  2.162  0.03065 * 
## CYTO_predicted_feature_15  0.02776   1.02815  0.27154  0.102  0.91856   
## CYTO_predicted_feature_16 -0.51062   0.60012  1.01129 -0.505  0.61361   
## CYTO_predicted_feature_17       NA        NA  0.00000     NA       NA   
## CYTO_predicted_feature_18 -0.04945   0.95175  0.34209 -0.145  0.88506   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                           exp(coef) exp(-coef) lower .95 upper .95
## CYTO_predicted_feature_01    1.5140     0.6605   1.00382    2.2835
## CYTO_predicted_feature_02    2.1120     0.4735   0.49621    8.9896
## CYTO_predicted_feature_03    0.3617     2.7646   0.18917    0.6917
## CYTO_predicted_feature_05    1.2013     0.8325   0.78357    1.8416
## CYTO_predicted_feature_06    1.2836     0.7790   0.76613    2.1506
## CYTO_predicted_feature_08    1.3156     0.7601   0.74467    2.3244
## CYTO_predicted_feature_12    1.7998     0.5556   0.07150   45.3101
## CYTO_predicted_feature_13    0.8684     1.1515   0.40827    1.8473
## CYTO_predicted_feature_14    2.8351     0.3527   1.10205    7.2933
## CYTO_predicted_feature_15    1.0282     0.9726   0.60384    1.7506
## CYTO_predicted_feature_16    0.6001     1.6663   0.08269    4.3556
## CYTO_predicted_feature_17        NA         NA        NA        NA
## CYTO_predicted_feature_18    0.9518     1.0507   0.48678    1.8608
## 
## Concordance= 0.61  (se = 0.026 )
## Likelihood ratio test= 20.78  on 12 df,   p=0.05
## Wald test            = 20.07  on 12 df,   p=0.07
## Score (logrank) test = 21.3  on 12 df,   p=0.05

Cox regression model built using only the 13 cytogenetic features suggests that only 3 out of 13 features have statistically significant effect on PFS. Therefore, these 3 features will be selected for further analysis. Next, Kaplan-meier curves for these 3 features will be plotted.

par(mfrow=c(1,3))
fit_cyto1 = survival::survfit(survival::Surv(D_PFS,HR_FLAG)~CYTO_predicted_feature_01,data=clin_data)
fit_cyto3 = survival::survfit(survival::Surv(D_PFS,HR_FLAG)~CYTO_predicted_feature_03,data=clin_data)
fit_cyto14 = survival::survfit(survival::Surv(D_PFS,HR_FLAG)~CYTO_predicted_feature_14,data=clin_data)
ggsurvplot(fit_cyto1,
           pval = TRUE, conf.int = TRUE)

ggsurvplot(fit_cyto3,
           pval = TRUE, conf.int = TRUE)

ggsurvplot(fit_cyto14,
           pval = TRUE, conf.int = TRUE)

Again, these plots suggest that the 3 features can be important predictors in our model, therefore they will be for the further downstream analysis.

## include only these 3 cyto features as they seem to have significant effect on survival 
clin_data <- clin_data %>% dplyr::select(-c("CYTO_predicted_feature_02", 
                                            "CYTO_predicted_feature_05", "CYTO_predicted_feature_06", "CYTO_predicted_feature_08", 
                                            "CYTO_predicted_feature_12", "CYTO_predicted_feature_13",  
                                            "CYTO_predicted_feature_15", "CYTO_predicted_feature_16", "CYTO_predicted_feature_17", 
                                            "CYTO_predicted_feature_18"))

4. Descriptive statistics

## Descriptive statistics

psych::describeBy(clin_data$D_PFS,group=clin_data$CYTO_predicted_feature_01)
## 
##  Descriptive statistics by group 
## group: 0
##    vars   n   mean    sd median trimmed    mad min  max range skew kurtosis    se
## X1    1 425 547.42 320.8    521  527.44 326.17   1 1572  1571 0.56     0.01 15.56
## ----------------------------------------------------------------------------- 
## group: 1
##    vars   n  mean     sd median trimmed    mad min  max range skew kurtosis    se
## X1    1 138 518.5 332.02    480  492.79 335.07   1 1497  1496 0.65    -0.06 28.26
psych::describeBy(clin_data$D_PFS,group=clin_data$CYTO_predicted_feature_03)
## 
##  Descriptive statistics by group 
## group: 0
##    vars   n   mean     sd median trimmed    mad min  max range skew kurtosis    se
## X1    1 464 528.08 315.45    490   507.6 329.14   1 1556  1555 0.57    -0.12 14.64
## ----------------------------------------------------------------------------- 
## group: 1
##    vars  n   mean     sd median trimmed    mad min  max range skew kurtosis    se
## X1    1 99 597.74 355.08    623  575.86 281.69   1 1572  1571 0.51     0.08 35.69
psych::describeBy(clin_data$D_PFS,group=clin_data$CYTO_predicted_feature_14)
## 
##  Descriptive statistics by group 
## group: 0
##    vars   n   mean     sd median trimmed    mad min  max range skew kurtosis    se
## X1    1 551 541.49 325.73    517  520.02 327.65   1 1572  1571 0.57    -0.04 13.88
## ----------------------------------------------------------------------------- 
## group: 1
##    vars  n mean     sd median trimmed   mad min max range  skew kurtosis    se
## X1    1 12  487 198.91  512.5   485.6 294.3 237 751   514 -0.11    -1.73 57.42
psych::describeBy(clin_data$D_PFS,group=clin_data$agegrp)
## 
##  Descriptive statistics by group 
## group: 20-34
##    vars n  mean     sd median trimmed    mad min  max range skew kurtosis   se
## X1    1 2 937.5 130.81  937.5   937.5 137.14 845 1030   185    0    -2.75 92.5
## ----------------------------------------------------------------------------- 
## group: 35-49
##    vars  n   mean     sd median trimmed    mad min  max range skew kurtosis    se
## X1    1 49 649.43 345.94    710  641.73 330.62  16 1373  1357 0.09    -0.79 49.42
## ----------------------------------------------------------------------------- 
## group: 50-64
##    vars   n   mean    sd median trimmed    mad min  max range skew kurtosis    se
## X1    1 233 552.45 304.7    541  539.07 299.49   1 1497  1496 0.38     -0.2 19.96
## ----------------------------------------------------------------------------- 
## group: 65-79
##    vars   n  mean     sd median trimmed    mad min  max range skew kurtosis    se
## X1    1 234 521.2 324.03  456.5  492.15 280.21   1 1572  1571 0.85     0.62 21.18
## ----------------------------------------------------------------------------- 
## group: 80-94
##    vars  n   mean     sd median trimmed    mad min  max range skew kurtosis    se
## X1    1 45 440.64 356.54    331  398.95 370.65  36 1433  1397  0.9     -0.1 53.15
psych::describeBy(clin_data$D_PFS,group=clin_data$D_ISS)
## 
##  Descriptive statistics by group 
## group: 1
##    vars   n   mean     sd median trimmed    mad min  max range skew kurtosis    se
## X1    1 189 593.67 323.77    595  574.72 309.86   1 1572  1571 0.55     0.09 23.55
## ----------------------------------------------------------------------------- 
## group: 2
##    vars   n   mean     sd median trimmed    mad min  max range skew kurtosis    se
## X1    1 209 534.31 311.37    510  518.53 318.76   1 1436  1435 0.48    -0.14 21.54
## ----------------------------------------------------------------------------- 
## group: 3
##    vars   n   mean     sd median trimmed    mad min  max range skew kurtosis    se
## X1    1 165 486.87 330.68    427  456.62 363.24   1 1556  1555 0.76     0.08 25.74

The descriptive analysis of the relevant factors from the clinical data show that median PFS time is different across the different groups of features. It will be interesting to see how these features combined with expression data perform in the predictive analysis and contribute to the prediction of death or relapse risk in newly diagnosed MM patients.

Expression data exploration

Next, expression dataset will be explored and later integrated with the clinical data to train a predictive model. The expression dataset is filtered for only those MM relevant genes based on literature research. Furthermore, only those samples for which clinical data is also available will be used for the further analysis. This way it becomes to easier to integrate the two datasets to build a predictive model.

1. Prepare data

exp_data <- read.csv("MMRF_CoMMpass_IA9_E74GTF_Salmon_entrezID_TPM_hg19.csv")

## list the risk genes - based on literature research

riskgenes <- c("KRAS", "NRAS", "DIS3", "TENT5C",
               "BRAF","HUWE1", "TP53", "TRAF3",
               "EGR1", "ATM", "H1-4", "FGFR3", "UBR5", "PRKD2", "CYLD", "ACTG1", 
               "IRF4", "MAX", "KMT2C", "CREBBP", "CCND1", "ARID1A", "EPAS1", 
               "ERC2", "PRC1", "CSGALNACT1", "PHF19", "NSD2")

# Remove first column from expdata - countdata, contains only the counts for all samples.
countdata_initial <- exp_data[,-1]

# Store EntrezGeneID as rownames
rownames(countdata_initial) <- exp_data[,1]

# make dgelist object
dge_initial <- DGEList(countdata_initial)

## get annotations for entrez ids
ann <- select(org.Hs.eg.db,keys=rownames(dge_initial$counts),columns=c("ENTREZID","SYMBOL","GENENAME"))

##double check that the ENTREZID column matches exactly to our rownames in our count data.
print(paste0("Entrez IDs in expression data = Entrez IDs in count data: True for all ", table(ann$ENTREZID==rownames(dge_initial$counts)), " Entrez IDs"))
## [1] "Entrez IDs in expression data = Entrez IDs in count data: True for all 24128 Entrez IDs"
## add this information to expression dataset
exp_data$genes <- ann$SYMBOL

## filter expression dataset for risk genes
mm_risk <- exp_data %>% dplyr::filter(genes %in% riskgenes)

## filter for samples where clinical info also available 
mm_risk_expdata <- mm_risk[, colnames(mm_risk) %in% clin_data$RNASeq_transLevelExpFileSamplId]

## bring columns in expression data in the correct order as in the clinical dataset
mm_risk_expdata <- mm_risk_expdata[clin_data$RNASeq_transLevelExpFileSamplId]

##rename entrez id and gene columns
mm_risk_expdata$Entrez_Id <- mm_risk$X
mm_risk_expdata$Gene <- mm_risk$genes

## make new countdata using the filtered dataset
countdata <- mm_risk_expdata[,-(564:565)]
# Store EntrezGeneID as rownames
rownames(countdata) <- mm_risk_expdata[,564]

##Make sure that the column names are now the same as sample name in the clinical data file. This is good because it means our sample information in clinical data is in the same order as the columns in countdata
print(paste0("Column names = Sample names: True for all ", table(colnames(countdata)==clin_data$RNASeq_transLevelExpFileSamplId), " samples"))
## [1] "Column names = Sample names: True for all 563 samples"
## make a dgelist object for visualizations, qc checks
dge <- DGEList(countdata)

2. Quality control

Here, as I have subsetted the expression dataset for MM relevant genes, the filtering of lowly expressed genes (which is an important step in a standard RNASeq analysis pipeline), will be skipped and no differential analysis will be performed, as all the samples belong to the same group (newly diagnosed MM) and I want to include all these genes later in the prediction model. So, as next step, quality control on all the samples in the dataset will be performed to check that the data is of good quality, and that the samples are as we would expect.

## add clinical covariates (age+sex) to DGEList object
group_age <- factor(clin_data$agegrp)
group_sex <- factor(clin_data$D_Gender)

# Add the group information into the DGEList
dge$samples$age <- group_age
dge$samples$gender <- group_sex

## qc checks
## library size and distribution
barplot(dge$samples$lib.size/1e06, names=colnames(dge), las=2, ann=FALSE, cex.names=0.2)
mtext(side = 1, text = "Samples", line = 4)
mtext(side = 2, text = "Library size (millions)", line = 3)
title("Barplot of library sizes")

# examine the distributions of the raw counts using log of the counts.
# Get log2 counts per million
logcounts <- cpm(dge,log=TRUE)
# Check distributions of samples using boxplots
boxplot(logcounts, xlab="", ylab="Log2 counts per million",las=2, cex.axis=0.2)
# Let's add a blue horizontal line that corresponds to the median logCPM
abline(h=median(logcounts),col="blue")
title("Boxplots of logCPMs (unnormalised)")

From the boxplots it is oberved that the overall distributions of raw log-intensities are not identical but still not very different. If a sample is really far above or below the blue horizontal there my be a need to investigate that sample further. Therefore, for now all the samples will be kept. Next, multi-dimensional scaling (MDS) plots will be used as they are an incredibly useful tool for quality control and checking for outliers.

## multi-dimensional scaling plots

# set up colour schemes for age and gender
par(mfrow=c(1,2))
col.sex <- c("purple","orange")[as.factor(clin_data$D_Gender)]
col.age <- c("blue","red","black","green", "purple", "orange")[clin_data$agegrp]

#MDS with disease stage colouring
plotMDS(dge,col=col.sex)
# Let's add a legend to the plot so we know which colours correspond to which stage
legend("topleft",fill=c("purple","orange"),legend=levels(as.factor(clin_data$D_Gender)))
# Add a title
title("Sex")

# Similarly for age

plotMDS(dge,col=col.age)
legend("topleft",fill=c("blue","red","black","green", "purple", "orange"),legend=levels(clin_data$agegrp),cex=0.5)
title("Age")

No abnormalities are observed across the samples, as there are no clear outliers in the MDS plots. However, one would expect to observe a clear clustering of same age or sex groups and this is something that is not observed in these plots. Still, all the samples in the dataset will be kept for further downstream analysis, as none of them stand out in any way.

Next, the two datasets will be merged.

## reshape the expression dataset so that genes become column names
final_exp_data <- mm_risk_expdata[,-564] %>%
  gather(key = key, value = value, 1:563) %>%
  spread(key = names(mm_risk_expdata)[565], value = "value")

## rename the sample id column to merge with clinical data in the next step
final_exp_data <- final_exp_data %>% dplyr::rename(RNASeq_transLevelExpFileSamplId=key)

## merge clinical and expression data
clin_exp_merge <- merge(x=clin_data, y=final_exp_data, by="RNASeq_transLevelExpFileSamplId", all.x=T)

## change column names not suitable for R
clin_exp_merge <- clin_exp_merge %>% dplyr::rename(H1_4 = "H1-4")

3. Correlation of MM relevant genes with PFS

## correlation of genes with event time
corres_genes <- cor(clin_exp_merge[,c(5, 12:39)], method="pearson", use="pairwise.complete.obs")

# Get lower triangle of the correlation matrix
get_lower_tri<-function(cormat){
  cormat[upper.tri(cormat)] <- NA
  return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
  cormat[lower.tri(cormat)]<- NA
  return(cormat)
}

upper_tri <- get_upper_tri(corres_genes)

melted_cormat <- melt(upper_tri, na.rm = TRUE)
# plot Heatmap
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
  geom_tile(color = "white")+
  geom_text(aes(label = round(value, 2)), size=1.5)+
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Pearson\nCorrelation") +
  theme_minimal()+ 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 12, hjust = 1))+
  coord_fixed()

The correlation of MM relevant gene expression levels with PFS are represented in the heatmap. Positive correlations are red and negative correlations are blue. The plot indicates that some genes, namely CCND1, H1_4, HUWEI, KRAS, MAX, NRAS, PHF19, PRC1, TENT5C, TP53 and UBR5 show negative correlation with PFS. It will be interesting to see if these features will appear to be important predictors and how they impact the survival risk.

Modelling

1. Feature exploration: Attribution of each predictor to the predicted risk

## remove irrelevant columns from the dataset
train_data <- clin_exp_merge %>% dplyr::select(-c("RNASeq_transLevelExpFileSamplId", "Patient", "agegrp", "D_Gender"))

## generate random forest SRC importance plots
rfi=generateFilterValuesData(surv.task,method=c("randomForestSRC_importance"))%>%.$data%>%ggplot(aes(x=reorder(name,value),y=value,fill=reorder(name,value)))+geom_bar(alpha=0.8,stat="identity",color="black",show.legend=F)+scale_x_discrete("Features")+coord_flip()

## generate Cox-PH model univariate scores
uv=generateFilterValuesData(surv.task,imp.learner=makeLearner("surv.coxph"),method="univariate.model.score")%>%.$data%>%ggplot(aes(x=reorder(name,value),y=value,fill=reorder(name,value)))+geom_bar(alpha=0.8,stat="identity",color="black",show.legend=F)+scale_x_discrete("Features")+coord_flip()

## plot
grid.arrange(rfi,uv, ncol=2)

Results show some features are not important predictors, however, they still might contribute to the PFS risk (based on the univariate model). Thus, all features from the dataset will be included in our model.

2. Benchmark study: CoxPH vs Random forest SRC

Based on 5-fold cross-validation for comparing the performance of the 2 survival analysis algorithms

## make survival task
surv.task <- makeSurvTask(data=train_data, target = c("D_PFS", "HR_FLAG"))


## z-score normalize all numeric features 
surv.task <- normalizeFeatures(surv.task, method="standardize", cols=c("D_Age", "ACTG1", "ARID1A", "ATM", 
                                                                       "BRAF", "CCND1", "CREBBP", "CSGALNACT1", "CYLD", "DIS3", "EGR1", 
                                                                       "EPAS1", "ERC2", "FGFR3", "H1_4", "HUWE1", "IRF4", "KMT2C", "KRAS", 
                                                                       "MAX", "NRAS", "NSD2", "PHF19", "PRC1", "PRKD2", "TENT5C", "TP53", 
                                                                       "TRAF3", "UBR5"))

#Benchmark study : comparing 2 different algorithms for survival task

# Creating a list of 2 learners
svlearners=list(
  makeLearner("surv.coxph"),
  makeLearner("surv.randomForestSRC")
)

## 5-fold cross-validation
rdesc=makeResampleDesc("CV", iters=5, predict = "both")

# Initalising the Benchmark study
set.seed(123)
svbnmrk=benchmark(svlearners,surv.task,rdesc)

bmrkdata=getBMRPerformances(svbnmrk, as.df = TRUE)%>%.[,-1]

## plot and check the performance of the two learners
## Distribution of C-index of 2 survival algorithms, based on a 5-fold cross-validation
par(mfrow=c(1,2))
plotBMRRanksAsBarChart(svbnmrk)+scale_fill_manual(values=myfillcolors)

ggplot(bmrkdata)+geom_path(aes(x=iter,y=cindex,color=learner.id),size=1,alpha=0.8)+facet_wrap(~learner.id,scales="free",ncol=2)+scale_color_manual(values=myfillcolors)

Both learners perform equally well, however, proceed with the random forest SRC model, as from my previous experience, RF provides more interpretative results and can identify non-linear effects of all features.

3. Build Random Forest SRC model

## make final learner (random forest SRC)
svlearner <- makeLearner("surv.randomForestSRC")

## set parameter space
params <- makeParamSet(makeDiscreteParam("mtry", c(2,4,6,8,10)), makeDiscreteParam("ntree", c(3,10,50,100,300,1000)))

## define type of search (GridsearchCV)
ctrl <- makeTuneControlGrid()

## define CV scheme (5-fold CV)
rdesc=makeResampleDesc("CV", iters=5, predict = "both")

set.seed(2356)

## tune hyperparameters
tune <- tuneParams(learner=svlearner, task=surv.task, resampling=rdesc, par.set=params, control = ctrl,
                   show.info=T)

## change params (take only optimal params)
svlearner_tune <- setHyperPars(makeLearner("surv.randomForestSRC"), mtry=tune$x$mtry, ntree=tune$x$ntree)

svlearner_tune$par.vals <- list(importance=T)

## resampling - to anticipate how the model might possibly perform on a new dataset
## the CV scheme in mlr by default makes 80/20 split of the training dataset
cv <- resample(svlearner_tune, surv.task, rdesc, measures=cindex, models=T, extract=getFeatureImportance)

#train a model
rforestsrc <- train(svlearner_tune, surv.task)

4. Performance evaluation and feature importance

par(mfrow=c(1,2))
## performance evaluation - check c-index across 5 iterations

c_index_df <- cv$measures.test

##plot
c_index_plot <- ggplot(c_index_df)+geom_path(aes(x=iter,y=cindex),size=1,alpha=0.8)+scale_color_manual(values=myfillcolors)

## get feature importance using the cross validation set (average of all models)
imp_df_cv <- rbind(cv$extract[[1]]$res,
                   cv$extract[[2]]$res,
                   cv$extract[[3]]$res,
                   cv$extract[[4]]$res,
                   cv$extract[[5]]$res)%>% dplyr::group_by(variable) %>% dplyr::summarise(mean=mean(importance), sd=sd(importance)) %>% arrange(desc(mean))

imp_plot <- ggplot(imp_df_cv, aes(x=reorder(variable, mean), y=mean, fill=mean))+
  geom_bar(stat="identity", position="dodge")+coord_flip()+
  ylab("Variable importance")+xlab("")+ggtitle("Information Value Summary")+
  guides(fill=F)+scale_fill_gradient(low="red", high="blue")+theme_minimal()

grid.arrange(c_index_plot, imp_plot, ncol=2)

5. Kaplan-Meier curves for the top 5 important genes

par(mfrow=c(2,3))
## first:divide patients into two groups using gene expression median as a cutoff
clin_exp_merge = clin_exp_merge %>%
  mutate(PHF19_exp = case_when(
    PHF19 > quantile(PHF19, 0.5) ~ 'PHF19_High',
    PHF19 < quantile(PHF19, 0.5) ~ 'PHF19_Low',
    TRUE ~ NA_character_
  ))

clin_exp_merge = clin_exp_merge %>%
  mutate(PRC1_exp = case_when(
    PRC1 > quantile(PRC1, 0.5) ~ 'PRC1_High',
    PRC1< quantile(PRC1, 0.5) ~ 'PRC1_Low',
    TRUE ~ NA_character_
  ))

clin_exp_merge = clin_exp_merge %>%
  mutate(MAX_exp = case_when(
    MAX > quantile(MAX, 0.5) ~ 'MAX_High',
    MAX< quantile(MAX, 0.5) ~ 'MAX_Low',
    TRUE ~ NA_character_
  ))

clin_exp_merge = clin_exp_merge %>%
  mutate(CCND1_exp = case_when(
    CCND1 > quantile(CCND1, 0.5) ~ 'CCND1_High',
    CCND1< quantile(CCND1, 0.5) ~ 'CCND1_Low',
    TRUE ~ NA_character_
  ))

clin_exp_merge = clin_exp_merge %>%
  mutate(CSGALNACT1_exp = case_when(
    CSGALNACT1> quantile(CSGALNACT1, 0.5) ~ 'CSGALNACT1_High',
    CSGALNACT1< quantile(CSGALNACT1, 0.5) ~ 'CSGALNACT1_Low',
    TRUE ~ NA_character_
  ))

fit_phf19 = survival::survfit(Surv(D_PFS, HR_FLAG) ~ PHF19_exp, data = clin_exp_merge)
fit_prc1 = survival::survfit(Surv(D_PFS, HR_FLAG) ~ PRC1_exp, data = clin_exp_merge)
fit_max = survival::survfit(Surv(D_PFS, HR_FLAG) ~ MAX_exp, data = clin_exp_merge)
fit_ccnd1 = survival::survfit(Surv(D_PFS, HR_FLAG) ~ CCND1_exp, data = clin_exp_merge)
fit_csg = survival::survfit(Surv(D_PFS, HR_FLAG) ~ CSGALNACT1_exp, data = clin_exp_merge)

## plot
ggsurvplot(fit_phf19, pval = T, conf.int = T)

ggsurvplot(fit_prc1, pval = T, conf.int = T)

ggsurvplot(fit_max, pval = T, conf.int = T)

ggsurvplot(fit_ccnd1, pval = T, conf.int = T)

ggsurvplot(fit_csg, pval = T, conf.int = T)